import numpy as np
import pandas as pd
from linearmodels.iv import IV2SLS
from pathlib import Path


# Default sentinel values treated as "missing" when no variable-specific set is
# given. NOTE(review): this pools codes of every width, so it is only safe for
# single-digit scales; on wider variables (ages, years, deciles) values such as
# 7, 8, 9, 77, 88 or 99 can be legitimate and callers should pass their own set.
MISSING_CODES = {7, 8, 9, 77, 88, 99, 777, 888, 999, 7777, 8888, 9999}


def clean_numeric(series: pd.Series, missing_codes=None) -> pd.Series:
    """Coerce *series* to numeric and replace missing-value codes with NaN.

    Args:
        series: Raw column; entries that cannot be parsed as numbers become NaN.
        missing_codes: Optional collection of sentinel values to blank out.
            ``None`` (the default) uses the module-level ``MISSING_CODES``;
            an empty collection disables masking entirely.

    Returns:
        A numeric series with the same index, where unparseable entries and
        entries equal to one of the missing codes are NaN.
    """
    codes = MISSING_CODES if missing_codes is None else missing_codes
    s = pd.to_numeric(series, errors="coerce")
    # list() so any iterable of codes (set, tuple, frozenset) is accepted.
    return s.mask(s.isin(list(codes)))


def recode_yes_no(series: pd.Series) -> pd.Series:
    """Turn a 1/2 coded item into a 1.0/0.0 indicator.

    The input is first passed through ``clean_numeric``. A value of 1 becomes
    1.0, a value of 2 becomes 0.0, and every other value becomes NaN.
    """
    # ESS participation items are typically: 1 Yes, 2 No, other codes are missing.
    yes_no_to_binary = {1: 1.0, 2: 0.0}
    return clean_numeric(series).map(yes_no_to_binary)


def zscore(series: pd.Series) -> pd.Series:
    """Standardize *series* to mean 0 and unit population standard deviation.

    The input is cleaned with ``clean_numeric`` first. When the standard
    deviation (``ddof=0``) is NaN or not positive, the cleaned values
    multiplied by zero are returned instead of dividing by it.
    """
    values = clean_numeric(series)
    spread = values.std(ddof=0)
    if pd.notna(spread) and spread > 0:
        return (values - values.mean()) / spread
    # Degenerate case: no usable variation, so every non-missing entry is 0.
    return values * 0


def _numeric_without_codes(series: pd.Series, codes) -> pd.Series:
    """Coerce *series* to numeric and set only the listed sentinel codes to NaN."""
    values = pd.to_numeric(series, errors="coerce")
    return values.mask(values.isin(list(codes)))


def _standardize(series: pd.Series) -> pd.Series:
    """Population z-score of a continuous series, without survey-code masking."""
    values = pd.to_numeric(series, errors="coerce")
    spread = values.std(ddof=0)
    if pd.isna(spread) or spread <= 0:
        # No usable variation: return zeros (NaN stays NaN) instead of dividing.
        return values * 0
    return (values - values.mean()) / spread


def main() -> None:
    """Estimate the baseline IV models and write their summaries to a text file.

    Reads the survey extract and the country-year broadband table, merges
    them, builds the instrument, the endogenous regressor, the outcomes and
    the controls, fits one ``IV2SLS`` model per outcome with country and
    survey-year dummies and country-clustered standard errors, and writes all
    summaries to ``outputs/iv_country_year_baseline.txt``.

    Raises:
        ValueError: If an outcome has no complete cases to estimate on.
        pandas.errors.MergeError: If the broadband table has more than one
            row for a country-year.
    """
    root = Path(__file__).resolve().parents[1]
    ess = pd.read_parquet(root / "outputs" / "ess_r8_r11_min.parquet")
    bb = pd.read_csv(root / "data_external" / "broadband_country_year.csv")

    bb["year"] = pd.to_numeric(bb["year"], errors="coerce")
    # Rows without a parseable year cannot match any survey year; dropping them
    # keeps the uniqueness check below from tripping on repeated NaN keys.
    bb = bb.dropna(subset=["year"])
    df = ess.merge(
        bb,
        left_on=["cntry", "survey_year"],
        right_on=["cntry", "year"],
        how="left",
        # Duplicate country-years would silently duplicate respondents.
        validate="m:1",
    )

    # Build a simple composite instrument (country-year), then let IV2SLS handle the first stage.
    # The indicators are continuous, so survey missing codes are NOT stripped
    # here: a value that happens to equal e.g. 88 is data, not a sentinel.
    # NOTE(review): assumes the CSV marks missing values as blanks - confirm.
    df["z_infra_index"] = (
        _standardize(df["fixed_broadband_per100"]) + _standardize(df["internet_users_pct"])
    ) / 2

    # Endogenous internet use. The default code set is kept for this item.
    # NOTE(review): that is only safe if netusoft is a single-digit scale - confirm.
    df["netusoft"] = clean_numeric(df["netusoft"])

    # Outcomes (binary 0/1)
    for item in ("vote", "sgnptit", "pbldmn", "bctprd", "contplt", "badge"):
        df[f"y_{item}"] = recode_yes_no(df[item])
    df["y_pstplonl"] = recode_yes_no(df["pstplonl"]) if "pstplonl" in df.columns else np.nan

    # A composite participation index (mean of available noninstitutional items).
    # A row with no item answered has nothing to average and comes out as NaN.
    part_cols = ["y_sgnptit", "y_pbldmn", "y_bctprd", "y_contplt", "y_badge", "y_pstplonl"]
    df["y_part_index"] = df[part_cols].mean(axis=1, skipna=True)

    # Controls (minimal). Each variable gets its own missing codes: the pooled
    # default set would blank legitimate values (ages 77/88/99, 7-9 years of
    # education, income deciles 7-9).
    # NOTE(review): codes follow the usual ESS coding - confirm against the
    # codebook for the rounds in this extract.
    control_missing_codes = {
        "agea": {999},
        "gndr": {9},
        "eduyrs": {77, 88, 99},
        "hinctnta": {77, 88, 99},
    }
    for col, codes in control_missing_codes.items():
        df[col] = _numeric_without_codes(df[col], codes)

    # Keep a clean analysis sample
    base_cols = ["cntry", "survey_year", "z_infra_index", "netusoft", "agea", "gndr", "eduyrs", "hinctnta"]
    out_path = root / "outputs" / "iv_country_year_baseline.txt"

    results = []
    for y, y_label in [("y_part_index", "Participation index (mean of items)"), ("y_vote", "Voted")]:
        use = df[base_cols + [y]].dropna().copy()
        if use.empty:
            raise ValueError(f"No complete cases available for outcome {y!r}")

        # Take cluster ids from the country label BEFORE dummy-encoding. With
        # drop_first=True the baseline country has all-zero dummies, so reading
        # the label back from the dummies would merge it into another cluster.
        clusters = pd.factorize(use["cntry"])[0]

        # Fixed effects via dummies (small: 34 countries, 4 years)
        use = pd.get_dummies(use, columns=["cntry", "survey_year"], drop_first=True, dtype=float)

        yv = use[y]
        endog = use["netusoft"]
        instr = use["z_infra_index"]
        exog = use.drop(columns=[y, "netusoft", "z_infra_index"])
        # IV2SLS does not add an intercept; one is needed because a baseline
        # category was dropped from each set of dummies.
        exog.insert(0, "const", 1.0)

        mod = IV2SLS(yv, exog=exog, endog=endog, instruments=instr)
        res = mod.fit(cov_type="clustered", clusters=clusters)
        results.append((y_label, res))

    lines = [
        "IV baseline (country-year instrument index; clustered SE by country)",
        "Data: outputs/ess_r8_r11_min.parquet merged with data_external/broadband_country_year.csv",
        "",
    ]
    for label, res in results:
        lines.extend([f"== {label} ==", str(res.summary), ""])

    out_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"Wrote: {out_path}")


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

